library(tidyverse)
library(data.table)
library(mice)
library(skimr)
library(corrplot)
library(cowplot)
bankraw <- read.csv("https://raw.githubusercontent.com/JaclynCoate/6372_Project_2/master/Data/bank-additional-full.csv", header = TRUE, sep = ";", strip.white = TRUE)
str(bankraw)
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
head(bankraw)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no
tail(bankraw)
##       age         job marital           education default housing loan
## 41183  29  unemployed  single            basic.4y      no     yes   no
## 41184  73     retired married professional.course      no     yes   no
## 41185  46 blue-collar married professional.course      no      no   no
## 41186  56     retired married   university.degree      no     yes   no
## 41187  44  technician married professional.course      no      no   no
## 41188  74     retired married professional.course      no     yes   no
##        contact month day_of_week duration campaign pdays previous
## 41183 cellular   nov         fri      112        1     9        1
## 41184 cellular   nov         fri      334        1   999        0
## 41185 cellular   nov         fri      383        1   999        0
## 41186 cellular   nov         fri      189        2   999        0
## 41187 cellular   nov         fri      442        1   999        0
## 41188 cellular   nov         fri      239        3   999        1
##          poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m
## 41183     success         -1.1         94.767         -50.8     1.028
## 41184 nonexistent         -1.1         94.767         -50.8     1.028
## 41185 nonexistent         -1.1         94.767         -50.8     1.028
## 41186 nonexistent         -1.1         94.767         -50.8     1.028
## 41187 nonexistent         -1.1         94.767         -50.8     1.028
## 41188     failure         -1.1         94.767         -50.8     1.028
##       nr.employed   y
## 41183      4963.6  no
## 41184      4963.6 yes
## 41185      4963.6  no
## 41186      4963.6  no
## 41187      4963.6 yes
## 41188      4963.6  no

Question of Interest:

EDA to determine narrow down variables to use for the Logistic Regression model

Properly naming response variable

setnames(bankraw, "y", "Subscription")
colnames(bankraw)
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "Subscription"

Removing logically irrelevant variables

  • Upon reviewing the available metrics, there are certain variabels that would not make logical sense as a contribution to creating a logisitc regression model
str(bankraw)
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ Subscription  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
#Dropping logical irrelevant variables: "contact", "campaign", "pdays", "previous"
bankraw2 <- select(bankraw, -c("pdays", "previous", "contact", "campaign"))
head(bankraw2)
##   age       job marital   education default housing loan month day_of_week
## 1  56 housemaid married    basic.4y      no      no   no   may         mon
## 2  57  services married high.school unknown      no   no   may         mon
## 3  37  services married high.school      no     yes   no   may         mon
## 4  40    admin. married    basic.6y      no      no   no   may         mon
## 5  56  services married high.school      no      no  yes   may         mon
## 6  45  services married    basic.9y unknown      no   no   may         mon
##   duration    poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m
## 1      261 nonexistent          1.1         93.994         -36.4     4.857
## 2      149 nonexistent          1.1         93.994         -36.4     4.857
## 3      226 nonexistent          1.1         93.994         -36.4     4.857
## 4      151 nonexistent          1.1         93.994         -36.4     4.857
## 5      307 nonexistent          1.1         93.994         -36.4     4.857
## 6      198 nonexistent          1.1         93.994         -36.4     4.857
##   nr.employed Subscription
## 1        5191           no
## 2        5191           no
## 3        5191           no
## 4        5191           no
## 5        5191           no
## 6        5191           no
view(bankraw2)

NA Evaluation and Drop

#Checking for NAs
md.pattern(bankraw2)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##       age job marital education default housing loan month day_of_week
## 41188   1   1       1         1       1       1    1     1           1
##         0   0       0         0       0       0    0     0           0
##       duration poutcome emp.var.rate cons.price.idx cons.conf.idx
## 41188        1        1            1              1             1
##              0        0            0              0             0
##       euribor3m nr.employed Subscription  
## 41188         1           1            1 0
##               0           0            0 0
#Results show no NAs

Zero variance variable check - all show variance so remain in model

skim(bankraw2)
Data summary
Name bankraw2
Number of rows 41188
Number of columns 17
_______________________
Column type frequency:
factor 10
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
job 0 1 FALSE 12 adm: 10422, blu: 9254, tec: 6743, ser: 3969
marital 0 1 FALSE 4 mar: 24928, sin: 11568, div: 4612, unk: 80
education 0 1 FALSE 8 uni: 12168, hig: 9515, bas: 6045, pro: 5243
default 0 1 FALSE 3 no: 32588, unk: 8597, yes: 3
housing 0 1 FALSE 3 yes: 21576, no: 18622, unk: 990
loan 0 1 FALSE 3 no: 33950, yes: 6248, unk: 990
month 0 1 FALSE 10 may: 13769, jul: 7174, aug: 6178, jun: 5318
day_of_week 0 1 FALSE 5 thu: 8623, mon: 8514, wed: 8134, tue: 8090
poutcome 0 1 FALSE 3 non: 35563, fai: 4252, suc: 1373
Subscription 0 1 FALSE 2 no: 36548, yes: 4640

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 40.02 10.42 17.00 32.00 38.00 47.00 98.00 ▅▇▃▁▁
duration 0 1 258.29 259.28 0.00 102.00 180.00 319.00 4918.00 ▇▁▁▁▁
emp.var.rate 0 1 0.08 1.57 -3.40 -1.80 1.10 1.40 1.40 ▁▃▁▁▇
cons.price.idx 0 1 93.58 0.58 92.20 93.08 93.75 93.99 94.77 ▁▆▃▇▂
cons.conf.idx 0 1 -40.50 4.63 -50.80 -42.70 -41.80 -36.40 -26.90 ▅▇▁▇▁
euribor3m 0 1 3.62 1.73 0.63 1.34 4.86 4.96 5.04 ▅▁▁▁▇
nr.employed 0 1 5167.04 72.25 4963.60 5099.10 5191.00 5228.10 5228.10 ▁▁▃▁▇

Continuous Variable Multicollinearity Check

  • Multicollinearity will weaken the model
  • At first glance there does seem to be some correlation between a few of the continuous variables
  • When highglighting the yes versus no result for signing up, we cannot see a clear separation of anykind. This will lead us away from utilizing the principal componenet analysis technique for variable selection
view(bankraw2)
#Reducing to only continuous variables and graphing by continuous variables, then colored by response in order to determine if there is separation of results and the ability to utilzie PCA
bankraw2 %>% keep(is.numeric) %>% pairs(,col=bankraw2$Subscription)

# find the PCA
pcaBank <- prcomp(bankraw2 %>% keep(is.numeric), center = TRUE, scale = TRUE)
summary(pcaBank)
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5    PC6     PC7
## Standard deviation     1.863 1.0585 1.0006 0.9292 0.71194 0.1586 0.10344
## Proportion of Variance 0.496 0.1600 0.1430 0.1234 0.07241 0.0036 0.00153
## Cumulative Proportion  0.496 0.6561 0.7991 0.9225 0.99488 0.9985 1.00000
# create scatterplot of PCA, colored by Subscription
pcaDf <- data.frame(pcaBank$x)
cbind(bankraw2, pcaDf) %>% ggplot(aes(x=PC1, y=PC2, color=Subscription, shape=Subscription)) + scale_shape_manual(values=c(1, 18)) + geom_point(alpha = 0.75) + ggtitle("PCA Plot - Colored by Subscription")

  • To additionally check we will run a correlation matrix
    • Using the correlation matrix we can much more clearly see high correlation
      • cons.price.idx : nr.employed
      • cons.price.idx : emp.var.rate
      • cons.price.idx : euribor3m
      • nr.employed : emp.var.rate
      • nr.employed : euibor3m
      • emp.var.rate : euribor3m
densityPlots <- function(df, explanatory, response){
df %>% ggplot(aes_string(x = explanatory, fill = response)) + geom_density(alpha=0.5)
}

densityPlotsList <- lapply(bankraw2 %>% keep(is.numeric) %>% colnames, function(x) densityPlots(bankraw2, x, "Subscription"))

for(i in densityPlotsList){
  print(i)
}

#densityPlots(bankraw2, "age", "Subscription")
#Plot numeric variables v numeric variables
bankraw2 %>% keep(is.numeric) %>% cor %>% corrplot("upper", addCoef.col = "white", number.digits = 2, number.cex = 0.5, method="square", order="hclust", tl.srt=45, tl.cex = 0.8)

#Removing reviews_per_month due to high correlation of is and number_of_reviews
bank3 <- select(bankraw2, -c("cons.price.idx", "nr.employed", "emp.var.rate"))
skim(bank3)
Data summary
Name bank3
Number of rows 41188
Number of columns 14
_______________________
Column type frequency:
factor 10
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
job 0 1 FALSE 12 adm: 10422, blu: 9254, tec: 6743, ser: 3969
marital 0 1 FALSE 4 mar: 24928, sin: 11568, div: 4612, unk: 80
education 0 1 FALSE 8 uni: 12168, hig: 9515, bas: 6045, pro: 5243
default 0 1 FALSE 3 no: 32588, unk: 8597, yes: 3
housing 0 1 FALSE 3 yes: 21576, no: 18622, unk: 990
loan 0 1 FALSE 3 no: 33950, yes: 6248, unk: 990
month 0 1 FALSE 10 may: 13769, jul: 7174, aug: 6178, jun: 5318
day_of_week 0 1 FALSE 5 thu: 8623, mon: 8514, wed: 8134, tue: 8090
poutcome 0 1 FALSE 3 non: 35563, fai: 4252, suc: 1373
Subscription 0 1 FALSE 2 no: 36548, yes: 4640

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 40.02 10.42 17.00 32.00 38.00 47.00 98.00 ▅▇▃▁▁
duration 0 1 258.29 259.28 0.00 102.00 180.00 319.00 4918.00 ▇▁▁▁▁
cons.conf.idx 0 1 -40.50 4.63 -50.80 -42.70 -41.80 -36.40 -26.90 ▅▇▁▇▁
euribor3m 0 1 3.62 1.73 0.63 1.34 4.86 4.96 5.04 ▅▁▁▁▇
  • AFter reviewing the need to remove the below variables is clear
    • “cons.price.idx”, “nr.employed”, “emp.var.rate”
  • See correlation matrix below after correlated continuous variables have been removed
#Plot numeric continuous variables to double check all correlated values have been removed
bank3 %>% keep(is.numeric) %>% cor %>% corrplot("upper", addCoef.col = "white", number.digits = 2, number.cex = 0.5, method="square", order="hclust", tl.srt=45, tl.cex = 0.8)

  • Seeing highly correlated variables we are going to use PCA in order which numeric variables should be leveraged for our logistic regression

Categorical Variable Review

# 1. Name target variable
targetCatCat <- "Subscription"

# 2. Name explanatory variable
explanatory <- bank3 %>% keep(is.factor) %>% colnames

# 3. Create function
numCatCat <- function(df, explanatory, response) {
  ggplot(data = df) +geom_bar(aes_string(x = explanatory, fill = response), position = "fill", alpha = 0.9) + coord_flip() + xlab(explanatory)
}

  # 3a. Example of working function above
  # numCatCat(bank3, explanatory = "education", response = "Subscription")


# 4. Create plot list for plot_grid function to reference
plotlistCatCat <- lapply(explanatory, function(x) numCatCat(bank3, x, targetCatCat))

# output plots in a loop
for(i in plotlistCatCat){
  print(i)
}

  • Singular break downs of the above function
head(bank3)
##   age       job marital   education default housing loan month day_of_week
## 1  56 housemaid married    basic.4y      no      no   no   may         mon
## 2  57  services married high.school unknown      no   no   may         mon
## 3  37  services married high.school      no     yes   no   may         mon
## 4  40    admin. married    basic.6y      no      no   no   may         mon
## 5  56  services married high.school      no      no  yes   may         mon
## 6  45  services married    basic.9y unknown      no   no   may         mon
##   duration    poutcome cons.conf.idx euribor3m Subscription
## 1      261 nonexistent         -36.4     4.857           no
## 2      149 nonexistent         -36.4     4.857           no
## 3      226 nonexistent         -36.4     4.857           no
## 4      151 nonexistent         -36.4     4.857           no
## 5      307 nonexistent         -36.4     4.857           no
## 6      198 nonexistent         -36.4     4.857           no
  • Upon reviewing all of the Categorical variables we can clearly remove the below variables
    • marital, housing, loan, day_of_week
  • While the below variables seem to show strong correlation with the response variable
    • job, eduction, default, month, poutcome
bank4 <- select(bank3, -c("marital", "housing", "loan", "day_of_week"))

Summary Check on Variables

summary(bank4)
##       age                 job                      education    
##  Min.   :17.00   admin.     :10422   university.degree  :12168  
##  1st Qu.:32.00   blue-collar: 9254   high.school        : 9515  
##  Median :38.00   technician : 6743   basic.9y           : 6045  
##  Mean   :40.02   services   : 3969   professional.course: 5243  
##  3rd Qu.:47.00   management : 2924   basic.4y           : 4176  
##  Max.   :98.00   retired    : 1720   basic.6y           : 2292  
##                  (Other)    : 6156   (Other)            : 1749  
##     default          month          duration             poutcome    
##  no     :32588   may    :13769   Min.   :   0.0   failure    : 4252  
##  unknown: 8597   jul    : 7174   1st Qu.: 102.0   nonexistent:35563  
##  yes    :    3   aug    : 6178   Median : 180.0   success    : 1373  
##                  jun    : 5318   Mean   : 258.3                      
##                  nov    : 4101   3rd Qu.: 319.0                      
##                  apr    : 2632   Max.   :4918.0                      
##                  (Other): 2016                                       
##  cons.conf.idx     euribor3m     Subscription
##  Min.   :-50.8   Min.   :0.634   no :36548   
##  1st Qu.:-42.7   1st Qu.:1.344   yes: 4640   
##  Median :-41.8   Median :4.857               
##  Mean   :-40.5   Mean   :3.621               
##  3rd Qu.:-36.4   3rd Qu.:4.961               
##  Max.   :-26.9   Max.   :5.045               
## 
write.csv(bank4, "/Users/michaelstephan/Desktop/SMU/spring 2020/Applied Statistics/Project 2/6372_Project_2/Data/bank-model input-no continuous.csv")

Adding new interactions

# job and education
bankraw %>% mutate(new = paste(job, education, sep="_")) %>% ggplot(aes(x=new, fill = Subscription)) + geom_bar(position="fill") + coord_flip() + ggtitle("Job and Education Colored by Subscription")

# month and marketing campaign
bankraw %>% mutate(new = paste(month, poutcome, sep="_")) %>% ggplot(aes(x=new, fill = Subscription)) + geom_bar(position="fill") + coord_flip() + ggtitle("Month and Marketing Campaign Colored by Subscription")

# month and marketing campaign broken down by successful months
bankraw %>% mutate(new = paste(case_when(month=="sep" ~1, month=="oct" ~1, month=="mar"~1, month=="dec"~1, TRUE~0), poutcome, sep="_")) %>% ggplot(aes(x=new, fill = Subscription)) + geom_bar(position="fill") + coord_flip() + ggtitle("Month Success and Marketing Campaign Colored by Subscription")

# find job breakdown
bankraw %>% group_by(job) %>% tally() %>% ggplot(aes(x="", y=n, fill = job)) + geom_bar(stat="identity") + coord_polar("y", start=0) + ggtitle("Percent Job Breakdown (Targeting the Majority")

# job breakdown of those who subscribed
bankraw %>% filter(Subscription == "yes") %>% group_by(job) %>% tally() %>% ggplot(aes(x="", y=n, fill = job)) + geom_bar(stat="identity") + coord_polar("y", start=0) + ggtitle("Percent Job Breakdown (Targeting Yes Subscriptions)")

# job and age (seniors)
bankraw %>% mutate(senior = case_when(age >= 65 ~ "senior", TRUE ~ "not senior")) %>% mutate(new = paste(job, senior, sep="_")) %>% ggplot(aes(x=new, fill = Subscription)) + geom_bar(position="fill") + coord_flip() + ggtitle("Job and Age Colored by Subscription")

# see breakdown by position count
bankraw %>% mutate(senior = case_when(age >= 65 ~ "senior", TRUE ~ "not senior")) %>% mutate(new = paste(job, senior, sep="_")) %>% group_by(new) %>% tally() %>% ggplot(aes(x="", y=n, fill = new)) + geom_bar(stat="identity") + coord_polar("y", start=0) + ggtitle("Job and Age Percent Breakdown (Targeting the Majority)")